Description

This is the report of the analysis made for the paper (TITLE) AND AUTHORS. INSERT ABSTRACT

Importing data

Importing data and filtering out those genes with cpm lesser than 1. We use the filtered.data method in NOISeq package.

countMatrix <- ReadDataFrameFromTsv(file.name.path="../../data/refSEQ_countMatrix.txt")
## ../../data/refSEQ_countMatrix.txt read from disk!
# head(countMatrix)

designMatrix <- ReadDataFrameFromTsv(file.name.path="../../design/all_samples_short_names_noHC7.tsv.csv")
## ../../design/all_samples_short_names_noHC7.tsv.csv read from disk!
# head(designMatrix)

filteredCountsProp <- filterLowCounts(counts.dataframe=countMatrix, 
                                    is.normalized=FALSE,
                                    design.dataframe=designMatrix,
                                    cond.col.name="gcondition",
                                    method.type="Proportion")
## features dimensions before normalization: 27179
## Filtering out low count features...
## 14495 features are to be kept for differential expression analysis with filtering method 3

Control Genes

Negative control genes

Loading Negative Control Genes to normalize data

library(readxl)

sd.ctrls <- read_excel(path="../../data/controls/Additional File 4 full list of BMC genomics SD&RS2.xlsx", sheet=1)
sd.ctrls <- sd.ctrls[order(sd.ctrls$adj.P.Val),]

sd.neg.ctrls <- sd.ctrls[sd.ctrls$adj.P.Val > 0.9, ]

sd.neg.ctrls <- sd.neg.ctrls$`MGI Symbol`
sd.neg.ctrls <- sd.neg.ctrls[-which(is.na(sd.neg.ctrls))]

int.neg.ctrls <- sd.neg.ctrls
int.neg.ctrls <- unique(int.neg.ctrls)
neg.map <- convertGenesViaMouseDb(gene.list=int.neg.ctrls, fromType="SYMBOL",
                                "ENTREZID")
# sum(is.na(neg.map$ENTREZID))
neg.ctrls.entrez <- as.character(neg.map$ENTREZID)


ind.ctrls <- which(rownames(filteredCountsProp) %in% neg.ctrls.entrez)
counts.neg.ctrls <- filteredCountsProp[ind.ctrls,]

positive control genes

Loading Positive Control Genes to detect them during the differential expression step.

## sleep deprivation
sd.lit.pos.ctrls <- read_excel("../../data/controls/SD_RS_PosControls_final.xlsx", 
                        sheet=1)
colnames(sd.lit.pos.ctrls) <- sd.lit.pos.ctrls[1,]
sd.lit.pos.ctrls <- sd.lit.pos.ctrls[-1,]


sd.est.pos.ctrls <- read_excel("../../data/controls/SD_RS_PosControls_final.xlsx", 
                        sheet=3)

sd.pos.ctrls <- cbind(sd.est.pos.ctrls$`MGI Symbol`, "est")
sd.pos.ctrls <- rbind(sd.pos.ctrls, cbind(sd.lit.pos.ctrls$Gene, "lit"))

sd.pos.ctrls <- sd.pos.ctrls[-which(duplicated(sd.pos.ctrls[,1])),]
sd.pos.ctrls <- sd.pos.ctrls[-which(is.na(sd.pos.ctrls[,1])),]

Normalizations

TMM Normalization

Normalizing data with TMM, as implemented in edgeR package, and plotting a PCA and an RLE plot of them.

normPropCountsTmm <- NormalizeData(data.to.normalize=filteredCountsProp, 
                                    norm.type="tmm", 
                                    design.matrix=designMatrix)

TMM + RUVs Normalization

Applying a RUVs method of RUVSeq package on normalized data, in order to adjust the counts for the unwanted variation. And of corse we plot a PCA and an RLE plot on these data.

library(RUVSeq)
neg.ctrl.list <- rownames(counts.neg.ctrls)
groups <- makeGroups(designMatrix$gcondition)
ruvedSExprData <- RUVs(as.matrix(round(normPropCountsTmm)), cIdx=neg.ctrl.list,
                       scIdx=groups, k=5)

normExprData <- ruvedSExprData$normalizedCounts

ggp1 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normExprData), 
                    design.matrix=designMatrix, shapeColname="condition", 
                    colorColname="genotype", xPCA="PC1", yPCA="PC2", 
                    plotly.flag=FALSE, show.plot.flag=FALSE, save.plot=FALSE,
                    prefix.plot=NULL)
## [1] FALSE
# ggp2 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normExprData), 
#                     design.matrix=designMatrix, shapeColname="condition", 
#                     colorColname="genotype", xPCA="PC2", yPCA="PC3", 
#                     plotly.flag=FALSE, show.plot.flag=FALSE, save.plot=FALSE,
#                     prefix.plot=NULL)
# ggp3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normExprData), 
#                     design.matrix=designMatrix, shapeColname="condition", 
#                     colorColname="genotype", xPCA="PC1", yPCA="PC3", 
#                     plotly.flag=FALSE, show.plot.flag=FALSE, save.plot=FALSE,
#                     prefix.plot=NULL)
# plotly::subplot(ggp1, ggp2, ggp3, nrows=2, margin = 0.1, titleX=TRUE, titleY=TRUE)
ggplotly(ggp1)
dir.create("plots")
save_plot(filename="plots/PCA.pdf", plot=ggp1)

pal <- RColorBrewer::brewer.pal(9, "Set1")
plotRLE(normExprData, outline=FALSE, col=pal[designMatrix$gcondition])

edgeR Differential Expression Analysis

Making differential expression analysis with edgeR package on four different contrasts.

Here is a brief legend:

  • WTHC5: Wild Type Home Cage Control 5 days
  • WTSD5: Wild Type Sleep Deprivation 5 days.
  • WTRS2: Wild Type Recovery Sleep 2 days.
  • S3HC5: Shank3 Home Cage Control 5 days.
  • S3SD5: Shank3 Sleep Deprivation 5 days.
  • S3RS2: Shank3 Recovery Sleep 2 days.
padj.thr <- 0.05
venn.padgj.thr <- 0.1
desMat <- cbind(designMatrix, ruvedSExprData$W)
colnames(desMat) <- c(colnames(designMatrix), colnames(ruvedSExprData$W))

# cc <- c("WTSD5 - WTHC5", "S3HC5 - WTHC5",
#         "S3SD5 - WTSD5", "S3SD5 - S3HC5")

cc <- c("S3HC5 - WTHC5", "S3SD5 - WTSD5", "S3RS2 - WTRS2")


rescList1 <- applyEdgeR(counts=filteredCountsProp, design.matrix=desMat,
                        factors.column="gcondition", 
                        weight.columns=c("W_1", "W_2", "W_3", "W_4", "W_5"),
                        contrasts=cc, useIntercept=FALSE, p.threshold=1,
                        is.normalized=FALSE, verbose=TRUE)
names <- names(rescList1)
rescList1 <- lapply(seq_along(rescList1), function(i) 
{
    attachMeans(normalized.counts=normExprData, design.matrix=desMat, 
                factor.column="gcondition", contrast.name=names(rescList1)[i],
                de.results=rescList1[[i]])
})
names(rescList1) <- names

Shank3 Home Cage control VS Wild Type Home Cage Controls

volcano plot

A volcano plot of differential expressed genes.

res.o.map1 <- convertGenesViaMouseDb(gene.list=rownames(rescList1[[1]]), 
                            fromType="ENTREZID")

res.o <- attachGeneColumnToDf(mainDf=rescList1[[1]],
                                genesMap=res.o.map1,
                                rowNamesIdentifier="ENTREZID",
                                mapFromIdentifier="ENTREZID",
                                mapToIdentifier="SYMBOL")
WriteDataFrameAsTsv(data.frame.to.save=res.o, 
                    file.name.path=paste0(names(rescList1)[1], "_edgeR"))

vp <- luciaVolcanoPlot(res.o, positive.controls.df=NULL, prefix=names(rescList1)[1], 
                    threshold=padj.thr)
ggplotly(vp)
de <- sum(res.o$FDR < padj.thr)
nde <- sum(res.o$FDR >= padj.thr)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[1]
ddetable <- detable

tot.ctrls <- dim(sd.pos.ctrls)[1]
idx.pc <- which(tolower(res.o$gene) %in% tolower(sd.pos.ctrls[,1]))
tot.pc.de <- sum(res.o$FDR[idx.pc] < padj.thr)
tot.pc.nde <- length(idx.pc) - tot.pc.de
pos.df <- cbind(tot.ctrls, tot.pc.de, tot.pc.nde)
colnames(pos.df) <- c("total_p.ctrl", "p.ctrl_de_mapped", 
                    "p.ctrl_notde_mapped")
rownames(pos.df) <- names(rescList1)[1]

kowthc5 <- res.o[which(res.o$FDR < padj.thr),]
kowthc5.sign.genes.entrez <- rownames(res.o)[which(res.o$FDR < venn.padgj.thr)]

Shank3 Sleed Deprivation VS Wild Type Sleep Deprivation

volcano plot

A volcano plot of differential expressed genes.

rs2.o.map <- convertGenesViaMouseDb(gene.list=rownames(rescList1[[2]]), 
                            fromType="ENTREZID")

res.rs2.o <- attachGeneColumnToDf(mainDf=rescList1[[2]],
                                genesMap=rs2.o.map,
                                rowNamesIdentifier="ENTREZID",
                                mapFromIdentifier="ENTREZID",
                                mapToIdentifier="SYMBOL")
WriteDataFrameAsTsv(data.frame.to.save=res.rs2.o, 
                    file.name.path=paste0(names(rescList1)[2], "_edgeR"))

vp <- luciaVolcanoPlot(res.rs2.o, positive.controls.df=sd.pos.ctrls, 
                       prefix=names(rescList1)[2], 
                       threshold=padj.thr)
ggplotly(vp)
de <- sum(res.rs2.o$FDR < padj.thr)
nde <- sum(res.rs2.o$FDR >= padj.thr)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[2]
ddetable <- rbind(ddetable, detable)

kowtsd5 <- res.rs2.o[which(res.rs2.o$FDR < padj.thr),]
kowtsd5.sign.genes.entrez <- rownames(res.rs2.o)[which(res.rs2.o$FDR < venn.padgj.thr)]

Shank3 Recovery Sleep VS Wild Type Recovery Sleep

volcano plot

A volcano plot of differential expressed genes.

res.o.map <- convertGenesViaMouseDb(gene.list=rownames(rescList1[[3]]), 
                            fromType="ENTREZID")

res.o <- attachGeneColumnToDf(mainDf=rescList1[[3]],
                                genesMap=res.o.map,
                                rowNamesIdentifier="ENTREZID",
                                mapFromIdentifier="ENTREZID",
                                mapToIdentifier="SYMBOL")

WriteDataFrameAsTsv(data.frame.to.save=res.o, 
                    file.name.path=paste0(names(rescList1)[3], "_edgeR"))

vp <- luciaVolcanoPlot(res.o, positive.controls.df=NULL, 
                        prefix=names(rescList1)[3], threshold=padj.thr)
ggplotly(vp)
de <- sum(res.o$FDR < padj.thr)
nde <- sum(res.o$FDR >= padj.thr)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[3]
ddetable <- rbind(ddetable, detable)

kowtsrs2 <- res.o[which(res.o$FDR < padj.thr),]
kowtrs2.sign.genes.entrez <- rownames(res.o)[which(res.o$FDR < venn.padgj.thr)]

DE TABLE + Positive Controls table

We present a summarization of the results. The first table is a summarization on how many genes are Differentially Expressed. The second table explains on the first column how many positive controls we have, on the second column how many positive controls have been identified over the differentially expressed genes, and, finally, on the third column how many positive controls have beed identified on the NOT differentially expressed genes.

ddetable
##               de   nde
## S3HC5 - WTHC5 28 14467
## S3SD5 - WTSD5 55 14440
## S3RS2 - WTRS2 23 14472
pos.df
##               total_p.ctrl p.ctrl_de_mapped p.ctrl_notde_mapped
## S3HC5 - WTHC5          579                3                 553

Venn Diagram

KOHC5-WTHC5 vs KOSD5-WTSD5

We take the results of two contrasts.

Knock Out Sleed Deprivation VS Wild Type Sleep Deprivation and Knock Out Home Cage control VS Wild Type Home Cage Controls . And plot the results in a Venn Diagram

source("../../R/venn2.R")
source("../../R/venn3.R")
gene.map <- convertGenesViaMouseDb(gene.list=rownames(normExprData),
                                   fromType="ENTREZID", toType="SYMBOL")
venn <- Venn2de(x=kowthc5.sign.genes.entrez, y=kowtsd5.sign.genes.entrez, 
        label1="S3HC5_WTHC5", label2="S3SD5_WTSD5",
        title="S3HC5_WTHC5 venn S3SD5_WTSD5", plot.dir="./",
        conversion.map=gene.map)

We take the results of three contrasts.

Shank3 Sleed Deprivation VS Wild Type Sleep Deprivation and Shank3 Home Cage Control VS Wild Type Home Cage Controls and Shank3 Home Recovery Sleep VS Wild Type Recovery Sleep. And plot the results in a Venn Diagram

venn3 <- Venn3de(x=kowthc5.sign.genes.entrez, y=kowtsd5.sign.genes.entrez, 
                 z=kowtrs2.sign.genes.entrez,
        label1="S3HC5_WTHC5", label2="S3SD5_WTSD5", label3="S3RS2_WTRS2",
        title="S3HC5_WTHC5 - S3SD5_WTSD5 - S3RS2_WTRS2", enrich.lists.flag=FALSE,
        plot.dir="./",
        conversion.map=gene.map)

Heatmaps

Union of genes

An heatmap of the union of the genes identified.

source("../../R/heatmapFunctions.R")
de.genes.entr <- union(rownames(venn$int), rownames(venn$XnoY))
de.genes.entr <- union(de.genes.entr, rownames(venn$YnoX))

de.genes.entr <- union(kowthc5.sign.genes.entrez, kowtsd5.sign.genes.entrez)
de.genes.entr <- union(de.genes.entr, kowtrs2.sign.genes.entrez)

gene.map <- convertGenesViaMouseDb(gene.list=de.genes.entr, 
                            fromType="ENTREZID")
de.genes.symb <- attachGeneColumnToDf(as.data.frame(de.genes.entr, 
                                                    row.names=de.genes.entr), 
                                    genesMap=gene.map, 
                                    rowNamesIdentifier="ENTREZID", 
                                    mapFromIdentifier="ENTREZID",
                                    mapToIdentifier="SYMBOL")

# de.genes.symb[which(is.na(de.genes.symb$gene)),]
de.genes.symb$gene[which(de.genes.symb$de.genes.entr=="100039826")]  <- "Gm2444" ## not annotated in ncbi
de.genes.symb$gene[which(de.genes.symb$de.genes.entr=="210541")]  <- "Entrez:210541" ## not annotated in ncbi


de.genes.counts <- normExprData[match(de.genes.symb$de.genes.entr, rownames(normExprData)),]
rownames(de.genes.counts) <- de.genes.symb$gene

de.gene.means <- computeGeneMeansOverGroups(counts=de.genes.counts,
                            design=designMatrix, groupColumn="gcondition")

library(gplots)
library(clusterExperiment)
color.palette = clusterExperiment::seqPal3#c("black", "yellow")
pal <- colorRampPalette(color.palette)(n = 1000)


library(pheatmap)
filter2 <- rowMeans(de.gene.means)>0
filter <- apply(de.gene.means, 1, function(x) log(x[4]/x[3]) * log(x[2]/x[1]) < 0)
filter[is.na(filter)] <- FALSE

heatmap K-means k=5

with k=5 the samples S3RS2 seems to be mixed with the S3SD5 ones.

ph1 <- pheatmap(log(de.genes.counts[filter2,]+1), cluster_cols=TRUE, scale="row", 
                color=pal, border_color=NA, fontsize_row=5)

ph1 <- pheatmap(log(de.genes.counts[filter2,]+1), cluster_cols=TRUE, scale="row", 
            color=pal, border_color=NA, fontsize_row=10, kmeans_k=5)

clusterized.genes <- as.data.frame(ph1$kmeans$cluster)

gene.map <- convertGenesViaMouseDb(gene.list=rownames(clusterized.genes), fromType="SYMBOL")
converted.clusterized.gens <- attachGeneColumnToDf(mainDf=clusterized.genes, genesMap=gene.map, 
                    rowNamesIdentifier="SYMBOL", mapFromIdentifier="SYMBOL", mapToIdentifier="ENTREZID")

converted.clusterized.gens$gene[which(rownames(converted.clusterized.gens)=="Gm2444")]  <- "100039826" ## not annotated in ncbi
converted.clusterized.gens$gene[which(rownames(converted.clusterized.gens)=="Entrez:210541")]  <- "210541" ## not annotated in ncbi
converted.clusterized.gens <- converted.clusterized.gens[order(converted.clusterized.gens$`ph1$kmeans$cluster`),]
WriteDataFrameAsTsv(data.frame.to.save=converted.clusterized.gens, file.name.path="clustered_genes_by_kmeans5")

heatmap K-means k=6

Using k=6 we maintain the clustering of S3RS2 samples and S3SD5 in two different clusters of columns.

ph1 <- pheatmap(log(de.genes.counts[filter2,]+1), cluster_cols=TRUE, scale="row", 
                color=pal, border_color=NA, fontsize_row=5)

ph1 <- pheatmap(log(de.genes.counts[filter2,]+1), cluster_cols=TRUE, scale="row", 
            color=pal, border_color=NA, fontsize_row=10, kmeans_k=6)

clusterized.genes <- as.data.frame(ph1$kmeans$cluster)

gene.map <- convertGenesViaMouseDb(gene.list=rownames(clusterized.genes), fromType="SYMBOL")
converted.clusterized.gens <- attachGeneColumnToDf(mainDf=clusterized.genes, genesMap=gene.map, 
                    rowNamesIdentifier="SYMBOL", mapFromIdentifier="SYMBOL", mapToIdentifier="ENTREZID")

converted.clusterized.gens$gene[which(rownames(converted.clusterized.gens)=="Gm2444")]  <- "100039826" ## not annotated in ncbi
converted.clusterized.gens$gene[which(rownames(converted.clusterized.gens)=="Entrez:210541")]  <- "210541" ## not annotated in ncbi
converted.clusterized.gens <- converted.clusterized.gens[order(converted.clusterized.gens$`ph1$kmeans$cluster`),]
WriteDataFrameAsTsv(data.frame.to.save=converted.clusterized.gens, file.name.path="clustered_genes_by_kmeans6")